package com.ppm.common.utils.file;

import com.ppm.common.utils.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers for downloading an HTML document from a URL and extracting the media
 * ({@code img}, {@code video}, {@code audio}) {@code src} URLs it references.
 *
 * <p>The HTML handling is regex based and intentionally simple; it is not a
 * full HTML parser.
 */
public class WebFileUtils {

    // Kept public and non-final so existing code that references this field keeps compiling.
    public static Logger log = LoggerFactory.getLogger(WebFileUtils.class);

    /** Connect and read timeout, in milliseconds, applied to every download. */
    private static final int TIMEOUT_MILLIS = 60000;

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Markup removed by {@link #filterH5Content(String)} before {@code <br>} handling,
     * applied in array order. All patterns are case-insensitive.
     */
    private static final Pattern[] STRIPPED_MARKUP = {
        // whole <head> element
        Pattern.compile("<head>[\\s\\S]*?</head>", Pattern.CASE_INSENSITIVE),
        // <input> tags, self-closed or not; bounded by the tag's own '>'
        Pattern.compile("<input[^>]*>", Pattern.CASE_INSENSITIVE),
        // comments
        Pattern.compile("<!--[\\s\\S]*?-->"),
        // remaining declarations such as <!DOCTYPE ...>
        Pattern.compile("<![\\s\\S]*?>"),
        // style sheets
        Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE),
        // scripts
        Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE),
        // MS Word <w:...> elements
        Pattern.compile("<w:[^>]+>[\\s\\S]*?</w:[^>]+>", Pattern.CASE_INSENSITIVE),
        // embedded <xml> blocks
        Pattern.compile("<xml>[\\s\\S]*?</xml>", Pattern.CASE_INSENSITIVE),
        // structural wrappers: only the tags are removed, their content is kept
        Pattern.compile("<html[^>]*>|<body[^>]*>|<div[^>]*>|</html>|</body>|</div>",
                Pattern.CASE_INSENSITIVE),
    };

    private static final Pattern BR_TAG = Pattern.compile("<br[^>]*>", Pattern.CASE_INSENSITIVE);

    private static final Pattern H1_ELEMENT =
            Pattern.compile("<h1>[\\s\\S]*?</h1>", Pattern.CASE_INSENSITIVE);

    /** Exact QR-code placeholder paragraph that is dropped from the content. */
    private static final String QR_CODE_PLACEHOLDER =
            "<p style=\"text-align:center\"><img id=\"QRCode\" style=\"width: 94%\"></p>";

    // The [^>]*? part keeps each match inside a single tag, so an <img> without a src
    // cannot pick up the src of a later, unrelated tag.
    private static final Pattern IMG_SRC =
            Pattern.compile("<img[^>]*?src\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    private static final Pattern VIDEO_SRC =
            Pattern.compile("<video[^>]*?src\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    private static final Pattern AUDIO_SRC =
            Pattern.compile("<audio[^>]*?src\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the HTML document at {@code url} and collects the media URLs it references.
     *
     * <p>The temporary file used for the download is deleted before this method returns.
     *
     * @param url address of the HTML document
     * @return a map with the keys {@code "img"}, {@code "video"} and {@code "audio"} (in that
     *         order), each mapped to the sorted set of {@code src} values found for that tag
     * @throws Exception if the document cannot be downloaded or read
     */
    public static Map<String, Set<String>> getMaterialFileUrlMap(String url) throws Exception {
        File file = getFileByUrl(url);
        String content;
        try {
            content = readFile(file);
        } finally {
            // The download is only needed until its content is in memory.
            deleteTempFile(file);
        }

        String html = filterH5Content(content);

        Map<String, Set<String>> result = new LinkedHashMap<>();
        result.put("img", getImageSrc(html));
        result.put("video", getVideoSrc(html));
        result.put("audio", getAudioSrc(html));
        return result;
    }

    /**
     * Downloads the resource at {@code url} into a newly created temporary file.
     *
     * <p>The file is created with {@link File#createTempFile(String, String)} using the prefix
     * {@code "netUrl"} and the URL's file name as suffix. The caller owns the returned file and
     * is responsible for deleting it. If the download fails, the partially written file is
     * removed before the exception is thrown.
     *
     * <p>NOTE(review): {@code url} is handed to {@link URL} unchecked, so any protocol the JVM
     * supports (including {@code file:}) is accepted. Validate the URL upstream if it can come
     * from an untrusted source.
     *
     * @param url address of the resource to download
     * @return the temporary file holding the downloaded bytes
     * @throws Exception wrapping the underlying failure if the resource cannot be downloaded
     */
    public static File getFileByUrl(String url) throws Exception {
        String fileName = getFileName(url);
        File file = null;
        try {
            file = File.createTempFile("netUrl", fileName);

            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // try-with-resources closes both streams even when one close() fails.
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new FileOutputStream(file)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int count;
                while ((count = in.read(buffer)) != -1) {
                    out.write(buffer, 0, count);
                }
            }
            return file;
        } catch (Exception e) {
            log.error("获取文件失败,url:" + url, e);
            // Do not leave a partial download behind.
            deleteTempFile(file);
            throw new Exception(e);
        }
    }

    /**
     * Reads a UTF-8 text file into a single string.
     *
     * <p>Line terminators are dropped: the lines are concatenated without any separator.
     *
     * @param file the file to read
     * @return the file content with all line terminators removed
     * @throws Exception wrapping the underlying failure if the file cannot be read
     */
    public static String readFile(File file) throws Exception {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (Exception e) {
            throw new Exception(e);
        }
        return content.toString();
    }

    /**
     * Strips markup that is not wanted in the extracted content.
     *
     * <p>Removed: the {@code head} element, {@code input} tags, comments and declarations,
     * {@code style}/{@code script} elements, Word ({@code w:}) and {@code xml} elements,
     * {@code html}/{@code body}/{@code div} tags (content kept), {@code h1} elements and the
     * QR-code placeholder paragraph. Each {@code <br>} tag is replaced by {@code "\n\r"}.
     * Tag names are matched case-insensitively.
     *
     * @param html the HTML to filter; {@code null} or empty input is returned unchanged
     * @return the filtered HTML
     */
    public static String filterH5Content(String html) {
        if (html == null || html.isEmpty()) {
            return html;
        }

        String filtered = html;
        for (Pattern markup : STRIPPED_MARKUP) {
            filtered = markup.matcher(filtered).replaceAll("");
        }
        filtered = BR_TAG.matcher(filtered).replaceAll("\n\r");
        filtered = H1_ELEMENT.matcher(filtered).replaceAll("");
        return filtered.replace(QR_CODE_PLACEHOLDER, "");
    }

    /**
     * Collects the {@code src} attribute values of all {@code img} tags.
     *
     * @param htmlCode the HTML to scan; may be {@code null}
     * @return the sorted set of trimmed {@code src} values, empty if none were found
     */
    public static Set<String> getImageSrc(String htmlCode) {
        return extractSrc(IMG_SRC, htmlCode);
    }

    /**
     * Collects the {@code src} attribute values of all {@code video} tags.
     *
     * @param htmlCode the HTML to scan; may be {@code null}
     * @return the sorted set of trimmed {@code src} values, empty if none were found
     */
    public static Set<String> getVideoSrc(String htmlCode) {
        return extractSrc(VIDEO_SRC, htmlCode);
    }

    /**
     * Collects the {@code src} attribute values of all {@code audio} tags.
     *
     * @param htmlCode the HTML to scan; may be {@code null}
     * @return the sorted set of trimmed {@code src} values, empty if none were found
     */
    public static Set<String> getAudioSrc(String htmlCode) {
        return extractSrc(AUDIO_SRC, htmlCode);
    }

    /**
     * Extracts the file name from a URL: the last non-empty path segment, ignoring any query
     * string and any trailing slashes.
     *
     * @param url the URL to inspect
     * @return the file name, an empty string if the path has no segment (for example
     *         {@code "/"} or {@code "?a=1"}), or {@code url} itself if
     *         {@code StringUtils.isEmpty(url)} is true
     */
    public static String getFileName(String url) {
        if (StringUtils.isEmpty(url)) {
            return url;
        }

        int queryStart = url.indexOf('?');
        String path = queryStart >= 0 ? url.substring(0, queryStart) : url;

        // Ignore trailing slashes so that "http://host/dir/" still yields "dir".
        int end = path.length();
        while (end > 0 && path.charAt(end - 1) == '/') {
            end--;
        }
        int start = path.lastIndexOf('/', end - 1) + 1;
        return path.substring(start, end);
    }

    /** Returns the sorted, trimmed values captured by group 1 of {@code tagSrcPattern}. */
    private static Set<String> extractSrc(Pattern tagSrcPattern, String htmlCode) {
        Set<String> sources = new TreeSet<>();
        if (htmlCode == null) {
            return sources;
        }
        Matcher matcher = tagSrcPattern.matcher(htmlCode);
        while (matcher.find()) {
            sources.add(matcher.group(1).trim());
        }
        return sources;
    }

    /** Deletes a temporary download, logging (not throwing) when the deletion fails. */
    private static void deleteTempFile(File file) {
        if (file != null && file.exists() && !file.delete()) {
            log.warn("Failed to delete temporary file: {}", file.getAbsolutePath());
        }
    }
}
